In [99]:
# Standard library
import os
import tempfile
import zipfile

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.io as pio
import polars as pl
import polars.selectors as cs
import shap
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split

# Render plotly figures inline (VS Code / Jupyter notebook renderers).
plotly.offline.init_notebook_mode()
pio.renderers.default = "vscode+notebook"
In [100]:
def read_dataset_export(file_name, src_folder=".",
                        tmp_folder=None,
                        lazy=False,
                        verbose=False):
    """Read a dataset export into a polars frame.

    The export is either a newline-delimited JSON file, or a ZIP archive
    (e.g. a Pega dataset export) containing a "data.json" entry.

    Parameters
    ----------
    file_name : str
        Name of the ".json" or ".zip" file. Resolved against the current
        directory first, then against ``src_folder``.
    src_folder : str
        Fallback folder in which to look for ``file_name``.
    tmp_folder : str, optional
        Folder used to extract "data.json" from a ZIP; defaults to the
        system temp directory.
    lazy : bool
        If True, return a LazyFrame via ``scan_ndjson`` instead of reading
        eagerly. For ZIP input the extracted temp file is then left on
        disk, because the lazy scan reads it later.
    verbose : bool
        Print progress information.

    Returns
    -------
    pl.DataFrame or pl.LazyFrame

    Raises
    ------
    Exception
        If the export cannot be located, or the ZIP has no "data.json".
    """
    json_file = None
    error_reason = "Unsupported file type"  # used if neither branch matches
    tmp_folder = tmp_folder if tmp_folder else tempfile.gettempdir()

    if file_name.endswith(".json"):
        error_reason = "JSON file not found"
        if os.path.exists(file_name):
            json_file = file_name
        elif os.path.exists(os.path.join(src_folder, file_name)):
            json_file = os.path.join(src_folder, file_name)
        if json_file and verbose:
            print("Reading JSON file", json_file)
        if json_file:
            # Use the same schema-inference window as the ZIP branch so both
            # paths produce consistent schemas on sparse columns.
            if lazy:
                multi_line_json = pl.scan_ndjson(json_file, infer_schema_length=100000)
            else:
                multi_line_json = pl.read_ndjson(json_file, infer_schema_length=100000)

    else:
        zip_file = file_name
        if file_name.endswith(".zip"):
            error_reason = "ZIP file not found"
            if os.path.exists(file_name):
                zip_file = file_name
            elif os.path.exists(os.path.join(src_folder, file_name)):
                zip_file = os.path.join(src_folder, file_name)
            if verbose:
                print("Reading ZIP file", zip_file)

            if os.path.exists(zip_file):
                error_reason = "Could not extract data.json"
                if verbose:
                    print("Extracting data.json from", zip_file)

                # Remove any stale extraction from a previous run.
                json_file = os.path.join(tmp_folder, "data.json")
                if os.path.exists(json_file):
                    os.remove(json_file)

                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    all_zip_entries = zip_ref.namelist()
                    json_file_in_zip = [s for s in all_zip_entries if "data.json" in s]
                    if verbose:
                        print("data.json in zip file:", json_file_in_zip, zip_file)

                    # Extract every matching entry; the last one wins.
                    for file in json_file_in_zip:
                        zip_ref.extract(file, tmp_folder)
                        json_file = os.path.join(tmp_folder, file)

                if not os.path.exists(json_file):
                    raise Exception(f"Dataset zipfile {zip_file} does not have \"data.json\"")
                if lazy:
                    multi_line_json = pl.scan_ndjson(json_file, infer_schema_length=100000)
                else:
                    multi_line_json = pl.read_ndjson(json_file, infer_schema_length=100000)
                    # Eager read is complete; clean up the extracted temp file.
                    os.remove(json_file)

    if json_file is None:
        raise Exception(f"Dataset export not found {error_reason}")
    return multi_line_json

Read and Pre-process data¶

In [101]:
# Read the export lazily; verbose=True prints the progress of locating the
# ZIP and extracting "data.json" (seen in the output below).
df = read_dataset_export( "Web_ClickThrough.zip", lazy=True, verbose=True)
df.describe()
Error reading ZIP file Web_ClickThrough.zip
Error extracting data.json Web_ClickThrough.zip
data.json in zip file: ['data.json'] Web_ClickThrough.zip
Out[101]:
shape: (9, 81)
statisticpyModelEvidenceDecision_InteractionIDCustomer_IsInArrearsCustomer_IsProspectIH_Retail_Inbound_Impression_pxLastGroupIDCustomer_IsActiveMilitaryServiceCustomer_IsInCollectionsCustomer_IsStudentIH_Web_Inbound_Impression_pyHistoricalOutcomeCountCustomer_IsBankruptcypyModelPerformanceDecision_DecisionTimeCustomer_NumCreditCardAccountCustomer_DebtToIncomeRatioCustomer_IsInDisasterAreaDecision_RankCustomer_IsCreditScoreStalepyPropensityCustomer_NumDepositAccountCustomer_PrimaryStateCustomer_IsInActiveComplaintnegativeSamplingCustomer_ResidentialStatusCustomer_PrimaryMobilePhoneCustomer_IsFinanciallyVulnerableContext_GroupIH_Web_Inbound_Clicked_pxLastGroupIDDecision_OutcomeWeightParam_JourneyCustomer_IsB2CidContext_IssueCustomer_MKTCLVValueCustomer_NetWealthCustomer_TotalLiabilitiesCustomer_OrganizationName…Context_NameParam_JourneyStageParam_LastJourneyStageCustomer_IsInPrecollectionsCustomer_HasCriticalIllnessdataCenterpositiveSamplingIH_Web_Inbound_Impression_pxLastGroupIDIH_Web_Inbound_Clicked_pyHistoricalOutcomeCountIH_Retail_Inbound_Impression_pyHistoricalOutcomeCountCustomer_OwnershipStatusDecision_OutcomeCustomer_AnnualIncomeParam_PriorStageInJourneyContext_TreatmentCustomer_LinkedInCustomer_PrefixIH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSinceIH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSinceDecision_SubjectIDCustomer_PrimaryCountryCodeCustomer_NumInvestmentAccountCustomer_PrimaryPostalCoderulesetNamerulesetVersionCustomer_IsCustomerActiveCustomer_IsIncarceratedCustomer_OrganizationIDpyModelPositivesCustomer_NumLoanAccountCustomer_BirthDateCustomer_PrimaryCityParam_DaysinCurrentStageIH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSinceContext_DirectionContext_ChannelCustomer_OwnedAccountTypes
strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrf64strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr…strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr
"count""32091""32091""32091""32091""162""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091"32091.0"32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091"…"32091""32091""32091""32091""32091""32091""32091""32091""32091""162""32091""32091""32091""32091""32091""32091""32091""32091""162""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""32091""9513"
"null_count""0""0""0""0""31929""0""0""0""0""0""0""0""0""0""0""0""0"0.0"0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0"…"0""0""0""0""0""0""0""0""0""31929""0""0""0""0""0""0""0""0""31929""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""0""22578"
"mean"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull0.599699nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull…nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"std"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull0.271621nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull…nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"min""0.0""-3042656458600205216""""""Creditcards""""""""10.0""""0.5""20250504T082914.693 GMT""""""""1.0"""0.127551"""""""100.0""""""""Creditcards""Creditcards""1.0""""""00005377-e1cc-560d-9d32-f04544…"Grow"""""""""…"VisaClassic""""""""""datacenter1""100.0""Creditcards""10.0""1.0""""Clicked""""""Hero Web""""""0.000006041666666666667""22.910304131944443""C-000""""""""NBA-Artifacts""01-01-01""""""""0.0""""""""0.0""0.000005520833333333333""Inbound""Web"""
"25%"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull0.418575nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull…nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"50%"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull0.570433nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull…nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"75%"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull0.863868nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull…nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"max""9996.0""-3042656458600399942""""""Creditcards""""false""true""9.0""""0.8475515053314687""20250505T084203.389 GMT""5.0""9.0""""1.0"""0.997062"2.0""""""100.0""""""""Creditcards""Creditcards""1.0""""""fffe469a-e880-5b2b-bc19-b281ee…"Grow""""990953.0""987467.0"""…"VisaClassic""""""""""datacenter1""100.0""Creditcards""9.0""9.0""""NoResponse""99769.0""""Hero Web""""""1.8764207060185185""23.951360659722223""C-999""""1.0""""NBA-Artifacts""01-01-01""""""""9998.0""2.0""0.36253927083453164""""0.0""1.853330787037037""Inbound""Web""Loan, Loan, Loan, Loan"
In [102]:
# Alphabetical list of every column in the export, for inspection.
columns = sorted(df.collect_schema().names())
columns
Out[102]:
['Context_Channel',
 'Context_Direction',
 'Context_Group',
 'Context_Issue',
 'Context_Name',
 'Context_Treatment',
 'Customer_AnnualIncome',
 'Customer_BirthDate',
 'Customer_CLV',
 'Customer_CreditScore',
 'Customer_DebtToIncomeRatio',
 'Customer_HasBrokenPromise',
 'Customer_HasCriticalIllness',
 'Customer_IsActiveMilitaryService',
 'Customer_IsB2C',
 'Customer_IsBankruptcy',
 'Customer_IsCreditScoreStale',
 'Customer_IsCustomerActive',
 'Customer_IsFinanciallyVulnerable',
 'Customer_IsInActiveComplaint',
 'Customer_IsInArrears',
 'Customer_IsInCollections',
 'Customer_IsInDisasterArea',
 'Customer_IsInPrecollections',
 'Customer_IsIncarcerated',
 'Customer_IsProspect',
 'Customer_IsStudent',
 'Customer_LinkedIn',
 'Customer_MKTCLVValue',
 'Customer_NetWealth',
 'Customer_NumCreditCardAccount',
 'Customer_NumDepositAccount',
 'Customer_NumInvestmentAccount',
 'Customer_NumLoanAccount',
 'Customer_OrganizationID',
 'Customer_OrganizationName',
 'Customer_OwnedAccountTypes',
 'Customer_OwnershipStatus',
 'Customer_Prefix',
 'Customer_PrimaryCity',
 'Customer_PrimaryCountry',
 'Customer_PrimaryCountryCode',
 'Customer_PrimaryMobilePhone',
 'Customer_PrimaryPostalCode',
 'Customer_PrimaryState',
 'Customer_RelationshipLengthDays',
 'Customer_ResidentialStatus',
 'Customer_TotalAssets',
 'Customer_TotalLiabilities',
 'Decision_DecisionTime',
 'Decision_InteractionID',
 'Decision_Outcome',
 'Decision_OutcomeTime',
 'Decision_OutcomeWeight',
 'Decision_Rank',
 'Decision_SubjectID',
 'IH_Retail_Inbound_Impression_pxLastGroupID',
 'IH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSince',
 'IH_Retail_Inbound_Impression_pyHistoricalOutcomeCount',
 'IH_Web_Inbound_Clicked_pxLastGroupID',
 'IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSince',
 'IH_Web_Inbound_Clicked_pyHistoricalOutcomeCount',
 'IH_Web_Inbound_Impression_pxLastGroupID',
 'IH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSince',
 'IH_Web_Inbound_Impression_pyHistoricalOutcomeCount',
 'Param_DaysinCurrentStage',
 'Param_Journey',
 'Param_JourneyStage',
 'Param_LastJourneyStage',
 'Param_PriorStageInJourney',
 'dataCenter',
 'id',
 'negativeSampling',
 'positiveSampling',
 'pyModelEvidence',
 'pyModelPerformance',
 'pyModelPositives',
 'pyPropensity',
 'rulesetName',
 'rulesetVersion']
In [103]:
# De-duplicate: keep only the last record per (interaction, treatment) pair.
df = df.unique(subset=['Decision_InteractionID', 'Context_Treatment'], keep='last')
In [104]:
# Clean and type the raw export:
#  1. empty strings -> null across all string columns;
#  2. counters / day counts / numeric customer attributes -> Float64, null -> 0;
#  3. "Customer_Is*" / "Customer_Has*" string flags -> real booleans.
# NOTE: the original chain ended with a second cast/fill of the same numeric
# columns; that step was an exact no-op repeat and has been removed.
df = df.with_columns(
    pl.when(pl.col(pl.String).str.len_chars() == 0)
    .then(None)
    .otherwise(pl.col(pl.String))
    .name.keep()
    ).with_columns(
        # Interaction-history recency/counter columns: numeric, 0 when absent.
        cs.ends_with("_DaysSince", 
                     "_pyHistoricalOutcomeCount",
                     "DaysinCurrentStage")
                     .cast(pl.Float64).fill_null(0),
        # Numeric customer attributes, 0 when absent.
        pl.col(
            [
                "Customer_AnnualIncome",
                "Customer_CreditScore",
                "Customer_DebtToIncomeRatio",
                "Customer_NetWealth",
                "Customer_RelationshipLengthDays",
                "Customer_TotalAssets",
                "Customer_TotalLiabilities",
                "Customer_BirthDate"
            ]
            )
        .cast(pl.Float64)
        .fill_null(0),
        cs.starts_with("Customer_Num").cast(pl.Float64).fill_null(0),
        cs.starts_with("Context_").cast(pl.String),
        # Boolean-ish flags arrive as strings; map every observed spelling.
        cs.starts_with("Customer_Is").replace_strict({"false":False, "true":True, "null":False, "False":False, "True":True}),
        cs.starts_with("Customer_Has").replace_strict({"false":False, "true":True, "null":False, "False":False, "True":True})
    ).with_columns(
        cs.starts_with("Customer_Is").fill_null(False).cast(pl.Boolean),
        cs.starts_with("Customer_Has").fill_null(False).cast(pl.Boolean)
    )
In [105]:
# Drop identifiers, decision metadata, and model bookkeeping columns: they
# must not be used as predictors (outcome leakage / pure noise).
# Note: the list originally contained "rulesetVersion" twice; the duplicate
# entry has been removed.
df = df.drop(["rulesetVersion", "id", "dataCenter", "negativeSampling", "positiveSampling", "rulesetName",
                "Decision_SubjectID", "Decision_OutcomeTime", "Decision_Rank", "Decision_InteractionID",
                "Decision_DecisionTime", "Decision_OutcomeWeight", "pyModelEvidence", "pyModelPerformance", 
                "pyModelPositives", "pyPropensity"])
In [106]:
# Collect every non-Decision string column as a categorical feature, turning
# its missing values into an explicit 'N/A' category for CatBoost.
cat_features = []
schema = df.collect_schema()

for cname in schema.names():
    ctype = schema[cname]
    if cname.startswith("Decision_") or not pl.String.is_(ctype):
        continue
    df = df.with_columns(pl.col(cname).fill_null('N/A'))
    cat_features.append(cname)
print(cat_features)
['IH_Retail_Inbound_Impression_pxLastGroupID', 'Customer_PrimaryState', 'Customer_ResidentialStatus', 'Customer_PrimaryMobilePhone', 'Context_Group', 'IH_Web_Inbound_Clicked_pxLastGroupID', 'Param_Journey', 'Context_Issue', 'Customer_MKTCLVValue', 'Customer_OrganizationName', 'Customer_CLV', 'Customer_PrimaryCountry', 'Context_Name', 'Param_JourneyStage', 'Param_LastJourneyStage', 'IH_Web_Inbound_Impression_pxLastGroupID', 'Customer_OwnershipStatus', 'Param_PriorStageInJourney', 'Context_Treatment', 'Customer_LinkedIn', 'Customer_Prefix', 'Customer_PrimaryCountryCode', 'Customer_PrimaryPostalCode', 'Customer_OrganizationID', 'Customer_PrimaryCity', 'Context_Direction', 'Context_Channel', 'Customer_OwnedAccountTypes']
In [107]:
# CatBoost text-processing configuration: split comma-separated values into
# tokens, build a unigram word dictionary, and encode each value as a
# bag-of-words. Used for the CSV-list property below.
text_processing_options = {
    # Split on commas (the values are CSV lists, e.g. "Loan, Loan, Loan").
    "tokenizers": [{
        "tokenizer_id": "comma",
        "delimiter": ",",
        "lowercasing": "true"
    }],

    # Unigram dictionary over the comma-separated tokens.
    "dictionaries": [{
        "dictionary_id": "Word",
        "gram_order": "1"
    }],

    # Encode every text feature as bag-of-words over that dictionary.
    "feature_processing": {
        "default": [{
            "dictionaries_names": ["Word"],
            "feature_calcers": ["BoW"],
            "tokenizers_names": ["comma"]
        }]
    }
}
# The only text feature in this dataset.
text_features = ['Customer_OwnedAccountTypes']
In [108]:
# Exclude the text features from the categorical list. A list comprehension
# (instead of a set difference) keeps the original column order, which is
# deterministic across interpreter runs — set ordering of strings is not,
# due to hash randomization.
cat_features = [c for c in cat_features if c not in text_features]
In [109]:
# Materialize the lazy frame (runs the whole pipeline) and peek at the result.
df = df.collect()
df.head()
Out[109]:
shape: (5, 64)
Customer_IsInArrearsCustomer_IsProspectIH_Retail_Inbound_Impression_pxLastGroupIDCustomer_IsActiveMilitaryServiceCustomer_IsInCollectionsCustomer_IsStudentIH_Web_Inbound_Impression_pyHistoricalOutcomeCountCustomer_IsBankruptcyCustomer_NumCreditCardAccountCustomer_DebtToIncomeRatioCustomer_IsInDisasterAreaCustomer_IsCreditScoreStaleCustomer_NumDepositAccountCustomer_PrimaryStateCustomer_IsInActiveComplaintCustomer_ResidentialStatusCustomer_PrimaryMobilePhoneCustomer_IsFinanciallyVulnerableContext_GroupIH_Web_Inbound_Clicked_pxLastGroupIDParam_JourneyCustomer_IsB2CContext_IssueCustomer_MKTCLVValueCustomer_NetWealthCustomer_TotalLiabilitiesCustomer_OrganizationNameCustomer_HasBrokenPromiseCustomer_CLVCustomer_RelationshipLengthDaysCustomer_PrimaryCountryCustomer_TotalAssetsCustomer_CreditScoreContext_NameParam_JourneyStageParam_LastJourneyStageCustomer_IsInPrecollectionsCustomer_HasCriticalIllnessIH_Web_Inbound_Impression_pxLastGroupIDIH_Web_Inbound_Clicked_pyHistoricalOutcomeCountIH_Retail_Inbound_Impression_pyHistoricalOutcomeCountCustomer_OwnershipStatusDecision_OutcomeCustomer_AnnualIncomeParam_PriorStageInJourneyContext_TreatmentCustomer_LinkedInCustomer_PrefixIH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSinceIH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSinceCustomer_PrimaryCountryCodeCustomer_NumInvestmentAccountCustomer_PrimaryPostalCodeCustomer_IsCustomerActiveCustomer_IsIncarceratedCustomer_OrganizationIDCustomer_NumLoanAccountCustomer_BirthDateCustomer_PrimaryCityParam_DaysinCurrentStageIH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSinceContext_DirectionContext_ChannelCustomer_OwnedAccountTypes
boolboolstrboolboolboolf64boolf64f64boolboolf64strboolstrstrboolstrstrstrboolstrstrf64f64strboolstrf64strf64f64strstrstrboolboolstrf64f64strstrf64strstrstrstrf64f64strf64strboolboolstrf64f64strf64f64strstrstr
falsefalse"N/A"falsefalsefalse13.0false2.032.0falsefalse2.0"N/A"false"N/A""N/A"false"Creditcards""Creditcards""N/A"false"Grow""N/A"0.00.0"N/A"false"High"2468.0"N/A"0.09.0"VisaClassic""N/A""N/A"falsefalse"Creditcards"17.00.0"N/A""Clicked"0.0"N/A""Hero Web""N/A""N/A"0.0402670.0"N/A"1.0"N/A"falsefalse"N/A"1.00.0"N/A"0.00.001257"Inbound""Web""N/A"
falsefalse"N/A"falsefalsefalse25.0false4.036.0falsefalse2.0"N/A"false"N/A""N/A"false"Creditcards""Creditcards""N/A"false"Grow""N/A"4.05488e6447574.0"N/A"false"Loyal"3800.0"N/A"1.990639e64.0"VisaClassic""N/A""N/A"falsefalse"Creditcards"32.00.0"N/A""Clicked"496508.0"N/A""Hero Web""N/A""N/A"0.0050050.0"N/A"1.0"N/A"falsefalse"N/A"0.00.33066"N/A"0.00.005339"Inbound""Web""N/A"
falsefalse"N/A"falsefalsefalse19.0false0.02.0falsefalse0.0"N/A"false"N/A""N/A"false"Creditcards""Creditcards""N/A"false"Grow""N/A"0.00.0"N/A"false"N/A"303.0"N/A"0.00.0"VisaClassic""N/A""N/A"falsefalse"Creditcards"14.00.0"N/A""Clicked"0.0"N/A""Hero Web""N/A""N/A"0.0020630.0"N/A"0.0"N/A"falsefalse"N/A"0.00.0"N/A"0.00.007697"Inbound""Web""N/A"
falsefalse"N/A"falsefalsetrue19.0false2.025.0falsefalse0.0"N/A"false"N/A""N/A"false"Creditcards""Creditcards""N/A"false"Grow""N/A"0.00.0"N/A"false"High"2053.0"N/A"0.08.0"VisaClassic""N/A""N/A"falsefalse"Creditcards"29.00.0"N/A""NoResponse"0.0"N/A""Hero Web""N/A""N/A"0.2098140.0"N/A"0.0"N/A"falsefalse"N/A"2.00.0"N/A"0.00.008526"Inbound""Web""N/A"
falsefalse"N/A"falsefalsefalse29.0false5.010.0falsefalse0.0"N/A"false"N/A""N/A"false"Creditcards""Creditcards""N/A"false"Grow""N/A"0.00.0"N/A"false"High"1563.0"N/A"0.02.0"VisaClassic""N/A""N/A"falsefalse"Creditcards"13.00.0"N/A""Clicked"0.0"N/A""Hero Web""N/A""N/A"0.0003020.0"N/A"0.0"N/A"falsefalse"N/A"1.00.0"N/A"0.00.003019"Inbound""Web""N/A"

Train Model¶

  • Automatic handling of categorical features without manual encoding
  • Built‑in text processing (those properties cannot be used by ADM currently)

A CatBoost model is trained to understand which features perform well and which do not, compared to the out-of-the-box Naive Bayes ADM models. Text processing can be used to enhance model performance (in this example one property contains CSV lists, which cannot be processed by ADM, but, if this analysis reveals them as powerful predictors, they can be further encoded and added to ADM models as predictors).

In [110]:
# Split features/target, then carve off a held-out test set followed by a
# train/validation split. Both fractions are named constants (the first
# split previously used a hard-coded 0.1 that silently bypassed test_size).
dset = df.to_pandas()
y = dset['Decision_Outcome']
X = dset.drop(['Decision_Outcome'], axis=1)
seed = 127
holdout_size = 0.1  # fraction held out as the final test set
test_size = 0.2     # fraction of the remainder used for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=holdout_size, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=test_size, random_state=seed)
In [111]:
# Training parameters for the model WITH categorical + text features.
params = {'loss_function': 'Logloss',  # objective function
          'eval_metric': 'AUC',  # metric
          'verbose': 50,  # output to stdout info about training process every 50 iterations
          'random_seed': seed,
          'cat_features': cat_features,
          'text_features': text_features,
          'text_processing': text_processing_options,
          'one_hot_max_size': 1023,  # one-hot encode categoricals up to this cardinality
          'class_names': ['NoResponse', 'Clicked'],  # negative class listed first
          'iterations': 100,
          'learning_rate': 0.5,
          'depth': 8
          }
In [112]:
%%time
# Train the CatBoost model that includes the text feature.
cbc_1 = CatBoostClassifier(**params)
cbc_1.fit(X=X_train, y=y_train,  # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
          eval_set=(X_val, y_val),  # data to validate on
          # True if we don't want to save trees created after iteration with the best validation score
          use_best_model=True,
          # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
          plot=True
          )
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
0:	test: 0.8568634	best: 0.8568634 (0)	total: 8.22ms	remaining: 814ms
50:	test: 0.8808364	best: 0.8851585 (8)	total: 330ms	remaining: 317ms
99:	test: 0.8747479	best: 0.8851585 (8)	total: 619ms	remaining: 0us

bestTest = 0.8851585395
bestIteration = 8

Shrink model to first 9 iterations.
CPU times: user 2.78 s, sys: 697 ms, total: 3.48 s
Wall time: 713 ms
Out[112]:
<catboost.core.CatBoostClassifier at 0x16e4c1350>

Review Model Parameters¶

In [113]:
# Evaluation pool over the held-out test set; it must declare the same
# categorical and text features as the trained model.
pool = Pool(X_test, y_test, cat_features=cat_features, text_features=text_features)
In [114]:
# Inspect the fully-resolved training parameters (explicit ones + defaults).
cbc_1.get_all_params()
Out[114]:
{'nan_mode': 'Min',
 'eval_metric': 'AUC',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 100,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 1023,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'max_ctr_complexity': 1,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': True,
 'class_names': ['NoResponse', 'Clicked'],
 'random_seed': 127,
 'depth': 8,
 'ctr_target_border_count': 1,
 'posterior_sampling': False,
 'has_time': False,
 'store_all_simple_ctr': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'text_processing': {'dictionaries': [{'start_token_id': '0',
    'occurrence_lower_bound': '5',
    'skip_step': '0',
    'end_of_word_token_policy': 'Insert',
    'token_level_type': 'Word',
    'end_of_sentence_token_policy': 'Skip',
    'gram_order': '1',
    'max_dictionary_size': '50000',
    'dictionary_id': 'Word'}],
  'feature_processing': {'default': [{'dictionaries_names': ['Word'],
     'feature_calcers': ['BoW:calcer_type="BoW"'],
     'tokenizers_names': ['comma']}]},
  'tokenizers': [{'number_token': '🔢',
    'skip_empty': '1',
    'number_process_policy': 'LeaveAsIs',
    'tokenizer_id': 'comma',
    'token_types': ['Number', 'Unknown', 'Word'],
    'delimiter': ',',
    'languages': [],
    'lemmatizing': '0',
    'split_by_set': '0',
    'lowercasing': '1',
    'subtokens_policy': 'SingleToken',
    'separator_type': 'ByDelimiter'}]},
 'loss_function': 'Logloss',
 'learning_rate': 0.5,
 'score_function': 'Cosine',
 'task_type': 'CPU',
 'leaf_estimation_iterations': 10,
 'bootstrap_type': 'MVS',
 'max_leaves': 256,
 'permutation_count': 4}
In [115]:
# Visualize the structure of the second tree (index 1) in the ensemble.
cbc_1.plot_tree(
    tree_idx=1,
    pool=pool
)
Out[115]:
No description has been provided for this image
In [116]:
# Importance by average change in prediction when the feature value changes
# (PredictionValuesChange — CatBoost's internal, prediction-based measure).
feature_importance = cbc_1.get_feature_importance(data=pool,
                                                  prettified=True,
                                                  verbose=True, type="PredictionValuesChange")
feature_importance
Used dataset leave statistics for fstr calculation
Out[116]:
Feature Id Importances
0 Customer_CLV 36.866382
1 Customer_CreditScore 19.152898
2 Customer_DebtToIncomeRatio 17.961440
3 Customer_RelationshipLengthDays 9.572969
4 IH_Web_Inbound_Impression_pxLastOutcomeTime_Da... 3.770789
... ... ...
58 Customer_OrganizationID 0.000000
59 Customer_PrimaryCity 0.000000
60 Param_DaysinCurrentStage 0.000000
61 Context_Direction 0.000000
62 Context_Channel 0.000000

63 rows × 2 columns

In [117]:
# Importance by change in the loss when a feature is excluded
# (LossFunctionChange). Note in the output below that the text feature
# Customer_OwnedAccountTypes ranks first under this measure.
feature_importance = cbc_1.get_feature_importance(data=pool,
                                                  prettified=True,
                                                  verbose=True, type="LossFunctionChange")
feature_importance
Used Logloss metric for fstr calculation
Selected 3210 documents from 3210 for LossFunctionChange calculation.
Used Logloss metric for fstr calculation
Started LossFunctionChange calculation
3210/3210 Process documents	passed time: 6.72ms	remaining time: 0us
Out[117]:
Feature Id Importances
0 Customer_OwnedAccountTypes 0.147050
1 Customer_CreditScore 0.060189
2 Customer_DebtToIncomeRatio 0.054758
3 Customer_RelationshipLengthDays 0.030252
4 IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysS... 0.005299
... ... ...
58 IH_Web_Inbound_Impression_pyHistoricalOutcomeC... -0.000293
59 Customer_NumCreditCardAccount -0.000321
60 Customer_TotalAssets -0.000521
61 Customer_NumLoanAccount -0.000533
62 Customer_BirthDate -0.000736

63 rows × 2 columns

In [118]:
# Make predictions with the resulting model: class labels, class
# probabilities, and raw (pre-sigmoid) formula values for the test pool.
preds = cbc_1.predict(pool)
preds_proba = cbc_1.predict_proba(pool)
print(preds_proba[:5])
print(cbc_1.predict(pool, 'RawFormulaVal')[:5])
[[0.99580922 0.00419078]
 [0.96839389 0.03160611]
 [0.11126779 0.88873221]
 [0.61395444 0.38604556]
 [0.99759843 0.00240157]]
[-5.47066774 -3.42228835  2.07785618 -0.46396535 -6.02922831]
In [119]:
from sklearn import metrics
# Confusion matrix and per-class precision/recall on the held-out test set,
# with rows/columns ordered as in params['class_names'].
print(metrics.confusion_matrix(y_test, preds, labels=params.get('class_names')))
print(metrics.classification_report(
    y_test, preds, labels=params.get('class_names')))
[[ 945  292]
 [ 377 1596]]
              precision    recall  f1-score   support

  NoResponse       0.71      0.76      0.74      1237
     Clicked       0.85      0.81      0.83      1973

    accuracy                           0.79      3210
   macro avg       0.78      0.79      0.78      3210
weighted avg       0.80      0.79      0.79      3210

In [120]:
from catboost.utils import get_roc_curve
from sklearn.metrics import auc

# ROC curve of cbc_1 on the held-out pool, and its area.
curve = get_roc_curve(cbc_1, pool)
(fpr, tpr, thresholds) = curve
roc_auc = auc(fpr, tpr)

# matplotlib.pyplot is already imported as plt at the top of the notebook;
# the duplicate in-cell import was removed.
plt.figure(figsize=(16, 8))
lw = 2

plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc, alpha=0.5)

# Diagonal chance line for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16)
plt.show()
No description has been provided for this image
In [121]:
# Overall misclassification rate on the test set (1 - accuracy).
print('error:', 1-np.mean(preds == np.ravel(y_test)))
error: 0.20841121495327097
In [122]:
# CatBoost logs per-iteration metrics to catboost_info/; plot the Logloss
# learning curves for train vs. eval data. (Variables were previously named
# rmse_* although they hold Logloss — renamed for accuracy.)
logloss_learn = pd.read_csv(
    'catboost_info/learn_error.tsv', header=0, delimiter='\t')
logloss_test = pd.read_csv('catboost_info/test_error.tsv',
                           header=0, delimiter='\t')
plt.plot(logloss_learn['Logloss'], label="Learn Error")
plt.plot(logloss_test['Logloss'], label="Test Error")
plt.xlabel('Iteration')
plt.ylabel('Logloss')
plt.legend();  # without this call the label= arguments above are never shown
Out[122]:
[<matplotlib.lines.Line2D at 0x310ee2e90>]
No description has been provided for this image

Model Analysis¶

In [123]:
# Load the SHAP JavaScript library so interactive force plots render.
shap.initjs()
No description has been provided for this image
In [124]:
# SHAP values from CatBoost: one column per feature plus a trailing column
# holding the expected (base) value.
shap_values = cbc_1.get_feature_importance(pool, type="ShapValues")
In [125]:
# Split the base value (last column, identical for all rows) off from the
# per-feature SHAP values.
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]
In [126]:
# Beeswarm summary of the top-20 features by SHAP impact.
shap.summary_plot(shap_values, X_test, max_display=20, plot_size=[14,10])
No description has been provided for this image
In [127]:
# Mean-|SHAP| bar chart of global feature importance.
shap.summary_plot(shap_values, X_test, plot_type="bar", plot_size=[14,10])
No description has been provided for this image

Prediction Explanations¶

In [128]:
# Force plot explaining a single prediction (row 50 of the test set).
shap.plots.force(expected_value, shap_values[50], feature_names=X_test.columns)
Out[128]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

Individual Feature Analysis¶

In [129]:
# Per-feature statistics (binned target vs. prediction means) for the
# strongest numeric predictors identified above.
feature = ['Customer_DebtToIncomeRatio', 'Customer_CreditScore', 'Customer_CLV', 'Customer_RelationshipLengthDays']
res = cbc_1.calc_feature_statistics(X_test, y_test, feature, plot=True)

Analyse Model Without Text Features¶

In [130]:
# Rebuild the matrices WITHOUT the text features, using the same seed and
# fractions, so the two models are otherwise trained on identical data.
# Both split fractions are named constants (the first split previously used
# a hard-coded 0.1 that silently bypassed test_size).
dset = df.to_pandas()
y = dset['Decision_Outcome']
X = dset.drop(['Decision_Outcome'] + text_features, axis=1)
seed = 127
holdout_size = 0.1  # fraction held out as the final test set
test_size = 0.2     # fraction of the remainder used for validation
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=holdout_size, random_state=seed)
X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_train2, y_train2, test_size=test_size, random_state=seed)
In [131]:
# Same training parameters as before but without text features, to measure
# their contribution. NOTE(review): this rebinds `params`, shadowing the
# earlier configuration used by cbc_1.
params = {'loss_function': 'Logloss',  # objective function
          'eval_metric': 'AUC',  # metric
          'verbose': 50,  # output to stdout info about training process every 50 iterations
          'random_seed': seed,
          'cat_features': cat_features,
          'class_names': ['NoResponse', 'Clicked'],  # negative class listed first
          'iterations': 100,
          'learning_rate': 0.5,
          'depth': 8
          }
In [132]:
%%time
# Train the comparison model without text features.
cbc_2 = CatBoostClassifier(**params)
cbc_2.fit(X=X_train2, y=y_train2,  # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
          eval_set=(X_val2, y_val2),  # data to validate on
          # True if we don't want to save trees created after iteration with the best validation score
          use_best_model=True,
          # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
          plot=True
          )
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
0:	test: 0.8241185	best: 0.8241185 (0)	total: 10.8ms	remaining: 1.06s
50:	test: 0.8816126	best: 0.8857634 (23)	total: 429ms	remaining: 412ms
99:	test: 0.8747540	best: 0.8857634 (23)	total: 821ms	remaining: 0us

bestTest = 0.8857633624
bestIteration = 23

Shrink model to first 24 iterations.
CPU times: user 4.69 s, sys: 831 ms, total: 5.52 s
Wall time: 935 ms
Out[132]:
<catboost.core.CatBoostClassifier at 0x303e1fad0>
In [133]:
def print_score_diff(first_model, second_model):
    """Print the validation AUC of two fitted CatBoost models and the
    relative change of the second vs the first, in percent."""
    baseline = first_model.best_score_['validation']['AUC']
    challenger = second_model.best_score_['validation']['AUC']

    relative_change = (challenger - baseline) / baseline * 100

    print('{} vs {} ({:+.2f}%)'.format(baseline, challenger, relative_change))
# Compare validation AUC of the no-text model (cbc_2) against the text-aware
# model (cbc_1).
print('Model AUC difference - without text features vs with text features.')
print_score_diff(cbc_2, cbc_1)
Model AUC difference - without text features vs with text features.
0.8857633624228525 vs 0.8851585394538517 (-0.07%)
In [134]:
# Build a SHAP TreeExplainer for the no-text model and compute SHAP values
# for every row of the held-out test set.
explainer = shap.TreeExplainer(cbc_2)
shap_values_exp = explainer(X_test2)
In [135]:
# Global feature importance: mean |SHAP value| per feature.
shap.plots.bar(shap_values_exp)
No description has been provided for this image
In [136]:
# Beeswarm: per-feature distribution of SHAP values, colored by feature value.
shap.plots.beeswarm(shap_values_exp)
No description has been provided for this image
In [137]:
# Interactive force plot on a 500-row sample (requires shap.initjs() to render).
shap.plots.force(explainer(X_test2.sample(n=500, random_state=seed)))
Out[137]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [138]:
# SHAP dependence for Customer_CLV, colored by Customer_DebtToIncomeRatio
# to surface a potential interaction between the two features.
shap.dependence_plot("Customer_CLV", shap_values_exp.values, X_test2, interaction_index="Customer_DebtToIncomeRatio")
No description has been provided for this image

Individual Prediction Explanation¶

In [139]:
# Force plot explaining the prediction for a single test-set row (index 8).
shap.plots.force(shap_values_exp[8])
Out[139]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [140]:
# Waterfall plot: additive SHAP contributions for test-set row 8.
shap.plots.waterfall(shap_values_exp[8])
No description has been provided for this image
In [141]:
# Predicted class probabilities for the same row (index 8).
# NOTE(review): .iloc[8] passes a 1-D Series (single sample); CatBoost accepts
# this, as the printed output below shows — .iloc[[8]] would keep 2-D shape.
preds_proba = cbc_2.predict_proba(X_test2.iloc[8])
print(preds_proba)
[0.00331373 0.99668627]
In [142]:
# Decision plot for row 8: cumulative path from the explainer's expected
# value to the model output, one step per feature.
shap.decision_plot(
    base_value=np.array([explainer.expected_value]),
    shap_values=explainer.shap_values(X_test2)[8],
    features=X_test2.columns
)
No description has been provided for this image

Feature dependency¶

In [143]:
# SHAP scatter for Customer_CLV, colored by Customer_CreditScore to reveal
# how credit score modulates the CLV effect.
feature = 'Customer_CLV'
shap.plots.scatter(shap_values_exp[:, feature], color=shap_values_exp[:, "Customer_CreditScore"])
No description has been provided for this image

Using global feature importance orderings¶

In [144]:
# Scatter for the globally most important feature (largest mean |SHAP value|).
shap.plots.scatter(shap_values_exp[:, shap_values_exp.abs.mean(0).argsort[-1]], alpha=0.2)
No description has been provided for this image

Model Calibration Quality¶

In [145]:
# Calibration-curve statistics: bin predicted probabilities and compare the
# mean prediction in each bin with the observed share of positives.
# (The duplicate in-cell `import numpy as np` was removed — numpy is already
# imported in the notebook's first cell.)
def calibration(groundtruth, probs):
    """Compute calibration-curve statistics for binary predictions.

    Parameters
    ----------
    groundtruth : array-like of bool/int
        Binary labels; truthy values are the positive class.
    probs : array-like of float
        Predicted probabilities of the positive class, same length as
        ``groundtruth``.

    Returns
    -------
    pl.DataFrame
        One row per probability bin (``bin = round(10 * p)``) with columns
        ``bin``, ``MeanProbs``, ``binPos``, ``binNeg``, ``PositivesShare``,
        sorted by bin. If the labels contain only one class, a one-row
        placeholder frame is returned instead.

    Raises
    ------
    ValueError
        If ``groundtruth`` contains more than two distinct values.
    """
    # Coerce labels to 0/1 integers (handles boolean input).
    groundtruth_binary = 1*np.array(groundtruth)
    nlabels = len(np.unique(groundtruth))

    if nlabels < 2:
        # Degenerate case: calibration is undefined when only one class is present.
        return pl.DataFrame({
            "MeanProbs": [0.5],
            "PositivesShare": [None],
            "binPos": [None],
            "binNeg": [None]
        })

    if nlabels > 2:
        raise ValueError("'groundtruth' has more than two levels.")

    # Pair each label with its predicted probability.
    probabilities = pl.DataFrame({
        "groundtruth": groundtruth_binary,
        "probs": probs
    })

    # Bin probabilities to one decimal place and aggregate per bin: mean
    # prediction, positive/negative counts, and empirical fraction of positives.
    grouped_probabilities = (probabilities
                             .with_columns((pl.col("probs") * 10).round().alias("bin"))  # Binning probs to 1 decimal place
                             .group_by("bin")
                             .agg([
                                 pl.mean("probs").alias("MeanProbs"),
                                 pl.sum("groundtruth").alias("binPos"),
                                 (pl.count("groundtruth") - pl.sum("groundtruth")).alias("binNeg"),
                                 (pl.sum("groundtruth") / pl.count("groundtruth")).alias("PositivesShare")
                             ])
                             .sort("bin"))
    return grouped_probabilities
In [146]:
# Binarize the outcome (positive class = 'Clicked') and compute calibration
# statistics for the text-aware model on the held-out test set.
# Vectorized comparison replaces the elementwise Series.apply(lambda ...),
# which is slower and non-idiomatic pandas; the resulting boolean Series is
# identical.
y_test_bin = y_test == 'Clicked'
preds_proba = cbc_1.predict_proba(X_test)
calibration_data = calibration(y_test_bin, preds_proba[:,1])
In [147]:
import plotly.express as px
import plotly.graph_objects as go

# Reliability diagram: mean predicted probability per bin (x) vs observed
# fraction of positives (y); a well-calibrated model hugs the diagonal.
fig = px.line(calibration_data, 
              x="MeanProbs", 
              y="PositivesShare")


# Add ideal calibration line (diagonal y = x, i.e. a perfectly calibrated model)
fig.add_shape(type="line", line=dict(dash='dash', color="darkred"), row='all', col='all', x0=0, y0=0, x1=1, y1=1)

# Customize the layout and labels
fig.update_layout(
    title="Model calibration plot",
    xaxis_title="Mean predicted probability",
    yaxis_title="Fraction of positives"
)

fig.show()